{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(700, 20) (700,) (300, 20) (300,)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "# 该模块为自定义模块，封装了构建决策树的基本方法\n",
    "from cart import *\n",
    "from sklearn.datasets import make_classification\n",
    "from sklearn.model_selection import train_test_split\n",
    "# 树的棵数\n",
    "n_estimators = 10\n",
    "# 列抽样最大特征数\n",
    "max_features = 15\n",
    "# 生成模拟二分类数据集\n",
    "X, y = make_classification(n_samples=1000, n_features=20, n_redundant=0, n_informative=2,\n",
    "                           random_state=1, n_clusters_per_class=1)\n",
    "rng = np.random.RandomState(2)\n",
    "X += 2 * rng.uniform(size=X.shape)\n",
    "# 划分数据集\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)\n",
    "print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 合并训练数据和标签\n",
    "X_y = np.concatenate([X, y.reshape(-1,1)], axis=1)\n",
    "np.random.shuffle(X_y)\n",
    "m = X_y.shape[0]\n",
    "sampling_subsets = []\n",
    "\n",
    "for _ in range(n_estimators):\n",
    "    idx = np.random.choice(m, m, replace=True)\n",
    "    bootstrap_Xy = X_y[idx, :]\n",
    "    bootstrap_X = bootstrap_Xy[:, :-1]\n",
    "    bootstrap_y = bootstrap_Xy[:, -1]\n",
    "    sampling_subsets.append([bootstrap_X, bootstrap_y])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000, 20)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sampling_subsets[0][0].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 自助抽样选择训练数据子集\n",
    "def bootstrap_sampling(X, y):\n",
    "    X_y = np.concatenate([X, y.reshape(-1,1)], axis=1)\n",
    "    np.random.shuffle(X_y)\n",
    "    n_samples = X.shape[0]\n",
    "    sampling_subsets = []\n",
    "\n",
    "    for _ in range(n_estimators):\n",
    "        # 第一个随机性，行抽样\n",
    "        idx1 = np.random.choice(n_samples, n_samples, replace=True)\n",
    "        bootstrap_Xy = X_y[idx1, :]\n",
    "        bootstrap_X = bootstrap_Xy[:, :-1]\n",
    "        bootstrap_y = bootstrap_Xy[:, -1]\n",
    "        sampling_subsets.append([bootstrap_X, bootstrap_y])\n",
    "    return sampling_subsets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(700, 20) (700,)\n"
     ]
    }
   ],
   "source": [
    "sampling_subsets = bootstrap_sampling(X_train, y_train)\n",
    "sub_X, sub_y = sampling_subsets[0]\n",
    "print(sub_X.shape, sub_y.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<cart.ClassificationTree at 0x1d5e15aaa20>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trees = []\n",
    "# 基于决策树构建森林\n",
    "for _ in range(n_estimators):\n",
    "    tree = ClassificationTree(min_samples_split=2, min_gini_impurity=999,\n",
    "                              max_depth=3)\n",
    "    trees.append(tree)\n",
    "\n",
    "trees[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The 1th tree is trained done...\n",
      "The 2th tree is trained done...\n",
      "The 3th tree is trained done...\n",
      "The 4th tree is trained done...\n",
      "The 5th tree is trained done...\n",
      "The 6th tree is trained done...\n",
      "The 7th tree is trained done...\n",
      "The 8th tree is trained done...\n",
      "The 9th tree is trained done...\n",
      "The 10th tree is trained done...\n"
     ]
    }
   ],
   "source": [
    "# 随机森林训练\n",
    "def fit(X, y):\n",
    "    # 对森林中每棵树训练一个双随机抽样子集\n",
    "    n_features = X.shape[1]\n",
    "    sub_sets = bootstrap_sampling(X, y)\n",
    "    for i in range(n_estimators):\n",
    "        sub_X, sub_y = sub_sets[i]\n",
    "        # 第二个随机性，列抽样\n",
    "        idx2 = np.random.choice(n_features, max_features, replace=True)\n",
    "        sub_X = sub_X[:, idx2]\n",
    "        trees[i].fit(sub_X, sub_y)\n",
    "        trees[i].feature_indices = idx2\n",
    "        print('The {}th tree is trained done...'.format(i+1))\n",
    "\n",
    "fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "300"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_preds = []\n",
    "for i in range(n_estimators):\n",
    "    idx = trees[i].feature_indices\n",
    "    sub_X = X_test[:, idx]\n",
    "    y_pred = trees[i].predict(sub_X)\n",
    "    y_preds.append(y_pred)\n",
    "    \n",
    "len(y_preds[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(300, 10)\n",
      "[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]\n"
     ]
    }
   ],
   "source": [
    "y_preds = np.array(y_preds).T\n",
    "print(y_preds.shape)\n",
    "y_pred = []\n",
    "for y_p in y_preds:\n",
    "    y_pred.append(np.bincount(y_p.astype('int')).argmax())\n",
    "\n",
    "print(y_pred[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.7366666666666667\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "print(accuracy_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "class RandomForest():\n",
    "    def __init__(self, n_estimators=100, min_samples_split=2, min_gain=0,\n",
    "                 max_depth=float(\"inf\"), max_features=None):\n",
    "        # 树的棵树\n",
    "        self.n_estimators = n_estimators\n",
    "        # 树最小分裂样本数\n",
    "        self.min_samples_split = min_samples_split\n",
    "        # 最小增益\n",
    "        self.min_gain = min_gain\n",
    "        # 树最大深度\n",
    "        self.max_depth = max_depth\n",
    "        # 所使用最大特征数\n",
    "        self.max_features = max_features\n",
    "\n",
    "        self.trees = []\n",
    "        # 基于决策树构建森林\n",
    "        for _ in range(self.n_estimators):\n",
    "            tree = ClassificationTree(min_samples_split=self.min_samples_split, min_impurity=self.min_gain,\n",
    "                                      max_depth=self.max_depth)\n",
    "            self.trees.append(tree)\n",
    "            \n",
    "    # 自助抽样\n",
    "    def bootstrap_sampling(self, X, y):\n",
    "        X_y = np.concatenate([X, y.reshape(-1,1)], axis=1)\n",
    "        np.random.shuffle(X_y)\n",
    "        n_samples = X.shape[0]\n",
    "        sampling_subsets = []\n",
    "\n",
    "        for _ in range(self.n_estimators):\n",
    "            # 第一个随机性，行抽样\n",
    "            idx1 = np.random.choice(n_samples, n_samples, replace=True)\n",
    "            bootstrap_Xy = X_y[idx1, :]\n",
    "            bootstrap_X = bootstrap_Xy[:, :-1]\n",
    "            bootstrap_y = bootstrap_Xy[:, -1]\n",
    "            sampling_subsets.append([bootstrap_X, bootstrap_y])\n",
    "        return sampling_subsets\n",
    "            \n",
    "    # 随机森林训练\n",
    "    def fit(self, X, y):\n",
    "        # 对森林中每棵树训练一个双随机抽样子集\n",
    "        sub_sets = self.bootstrap_sampling(X, y)\n",
    "        n_features = X.shape[1]\n",
    "        # 设置max_feature\n",
    "        if self.max_features == None:\n",
    "            self.max_features = int(np.sqrt(n_features))\n",
    "        \n",
    "        for i in range(self.n_estimators):\n",
    "            # 第二个随机性，列抽样\n",
    "            sub_X, sub_y = sub_sets[i]\n",
    "            idx2 = np.random.choice(n_features, self.max_features, replace=True)\n",
    "            sub_X = sub_X[:, idx2]\n",
    "            self.trees[i].fit(sub_X, sub_y)\n",
    "            # 保存每次列抽样的列索引，方便预测时每棵树调用\n",
    "            self.trees[i].feature_indices = idx2\n",
    "            print('The {}th tree is trained done...'.format(i+1))\n",
    "    \n",
    "    # 随机森林预测\n",
    "    def predict(self, X):\n",
    "        y_preds = []\n",
    "        for i in range(self.n_estimators):\n",
    "            idx = self.trees[i].feature_indices\n",
    "            sub_X = X[:, idx]\n",
    "            y_pred = self.trees[i].predict(sub_X)\n",
    "            y_preds.append(y_pred)\n",
    "            \n",
    "        y_preds = np.array(y_preds).T\n",
    "        res = []\n",
    "        for j in y_preds:\n",
    "            res.append(np.bincount(j.astype('int')).argmax())\n",
    "        return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The 1th tree is trained done...\n",
      "The 2th tree is trained done...\n",
      "The 3th tree is trained done...\n",
      "The 4th tree is trained done...\n",
      "The 5th tree is trained done...\n",
      "The 6th tree is trained done...\n",
      "The 7th tree is trained done...\n",
      "The 8th tree is trained done...\n",
      "The 9th tree is trained done...\n",
      "The 10th tree is trained done...\n"
     ]
    }
   ],
   "source": [
    "rf = RandomForest(n_estimators=10, max_features=15)\n",
    "rf.fit(X_train, y_train)\n",
    "y_pred = rf.predict(X_test)\n",
    "print(accuracy_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.82\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\Installation\\anaconda\\install\\lib\\site-packages\\sklearn\\ensemble\\forest.py:246: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
      "  \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "clf = RandomForestClassifier(max_depth=3, random_state=0)\n",
    "clf.fit(X_train, y_train)\n",
    "y_pred = clf.predict(X_test)\n",
    "print(accuracy_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
